This analysis utilizes the data found in a Kaggle competition where competitors seek to predict housing prices in King County, WA, USA (https://bit.ly/2lRv48E). The dataset provides fields including the number of bedrooms, number of bathrooms, number of floors, square footage of the living room, square footage of the overall lot, among others. The timeframe this dataset covers is the years 2014 through 2015. Therefore, this analysis will utilize a number of machine learning models and techniques in order to achieve the best possible models (in this instance measured using MAPE, MAE, RMSE, and R-Squared).

Data Import

# Load the train/test CSVs as data.tables. Spell out FALSE rather than the
# shorthand F, which is an ordinary (reassignable) variable in R.
raw_train_df <- fread('Data/house_price_train.csv', stringsAsFactors = FALSE)
raw_test_df <- fread('Data/house_price_test.csv', stringsAsFactors = FALSE)

# Inspect column names, types, and first values of the training set
str(raw_train_df)
## Classes 'data.table' and 'data.frame':   17277 obs. of  21 variables:
##  $ id           :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ... 
##  $ date         : chr  "5/13/2014" "8/27/2014" "7/18/2014" "1/30/2015" ...
##  $ price        : num  225000 641250 810000 330000 530000 ...
##  $ bedrooms     : int  3 3 4 4 4 4 4 3 4 3 ...
##  $ bathrooms    : num  1.5 2.5 3.5 1.5 1.75 3.5 3.25 2.25 2.5 1.5 ...
##  $ sqft_living  : int  1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
##  $ sqft_lot     : int  7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
##  $ floors       : num  1 3 2 1 1 2 2 1 2 1 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 2 2 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 4 4 3 3 3 3 3 ...
##  $ grade        : int  7 10 9 7 7 9 10 8 9 8 ...
##  $ sqft_above   : int  1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
##  $ sqft_basement: int  0 0 0 0 870 640 0 310 0 1040 ...
##  $ yr_built     : int  1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
##  $ yr_renovated : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
##  $ lat          : num  47.4 47.7 47.6 47.8 47.7 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
##  $ sqft_lot15   : int  7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Five-number summaries (plus mean) for every training column
summary(raw_train_df)
##        id                 date               price        
##  Min.   :   1000102   Length:17277       Min.   :  78000  
##  1st Qu.:2113701080   Class :character   1st Qu.: 320000  
##  Median :3902100205   Mode  :character   Median : 450000  
##  Mean   :4566440237                      Mean   : 539865  
##  3rd Qu.:7302900090                      3rd Qu.: 645500  
##  Max.   :9900000190                      Max.   :7700000  
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 1.000   Min.   :0.500   Min.   :  370   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1430   1st Qu.:   5050  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7620  
##  Mean   : 3.369   Mean   :2.114   Mean   : 2080   Mean   :  15186  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10695  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1164794  
##      floors        waterfront            view          condition    
##  Min.   :1.000   Min.   :0.000000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :1.500   Median :0.000000   Median :0.0000   Median :3.000  
##  Mean   :1.493   Mean   :0.007467   Mean   :0.2335   Mean   :3.413  
##  3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000  
##  Max.   :3.500   Max.   :1.000000   Max.   :4.0000   Max.   :5.000  
##      grade         sqft_above   sqft_basement       yr_built   
##  Min.   : 3.00   Min.   : 370   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.00   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.00   Median :1564   Median :   0.0   Median :1975  
##  Mean   : 7.66   Mean   :1791   Mean   : 289.4   Mean   :1971  
##  3rd Qu.: 8.00   3rd Qu.:2210   3rd Qu.: 556.0   3rd Qu.:1997  
##  Max.   :13.00   Max.   :9410   Max.   :4820.0   Max.   :2015  
##   yr_renovated        zipcode           lat             long       
##  Min.   :   0.00   Min.   :98001   Min.   :47.16   Min.   :-122.5  
##  1st Qu.:   0.00   1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3  
##  Median :   0.00   Median :98065   Median :47.57   Median :-122.2  
##  Mean   :  85.35   Mean   :98078   Mean   :47.56   Mean   :-122.2  
##  3rd Qu.:   0.00   3rd Qu.:98117   3rd Qu.:47.68   3rd Qu.:-122.1  
##  Max.   :2015.00   Max.   :98199   Max.   :47.78   Max.   :-121.3  
##  sqft_living15    sqft_lot15    
##  Min.   : 460   Min.   :   659  
##  1st Qu.:1490   1st Qu.:  5100  
##  Median :1840   Median :  7639  
##  Mean   :1986   Mean   : 12826  
##  3rd Qu.:2360   3rd Qu.: 10080  
##  Max.   :6210   Max.   :871200
# Peek at the first six rows of the training data
head(raw_train_df)
##            id      date  price bedrooms bathrooms sqft_living sqft_lot
## 1: 9183703376 5/13/2014 225000        3      1.50        1250     7500
## 2:  464000600 8/27/2014 641250        3      2.50        2220     2550
## 3: 2224079050 7/18/2014 810000        4      3.50        3980   209523
## 4: 6163901283 1/30/2015 330000        4      1.50        1890     7540
## 5: 6392003810 5/23/2014 530000        4      1.75        1814     5000
## 6: 7974200948 5/20/2014 953007        4      3.50        3120     5086
##    floors waterfront view condition grade sqft_above sqft_basement
## 1:      1          0    0         3     7       1250             0
## 2:      3          0    2         3    10       2220             0
## 3:      2          0    2         3     9       3980             0
## 4:      1          0    0         4     7       1890             0
## 5:      1          0    0         4     7        944           870
## 6:      2          0    0         3     9       2480           640
##    yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1:     1967            0   98030 47.3719 -122.215          1260       7563
## 2:     1990            0   98117 47.6963 -122.393          2200       5610
## 3:     2006            0   98024 47.5574 -121.890          2220      65775
## 4:     1967            0   98155 47.7534 -122.318          1890       8515
## 5:     1951            0   98115 47.6840 -122.281          1290       5000
## 6:     2008            0   98115 47.6762 -122.288          1880       5092
# Check for null values: total count of NA cells across the training set
sum(is.na(raw_train_df))
## [1] 0
# Same NA check for the test set
sum(is.na(raw_test_df))
## [1] 0

Initial Cleaning

As time series modeling will not be utilized in this analysis, the day, month, and year of each purchase will be individually parsed out rather than using the datetime field.

clean_train_df <- raw_train_df
clean_test_df <- raw_test_df

# Train Data set: parse the m/d/Y date string once, then derive calendar parts.
# (The original indexed the table with its own date column,
# clean_train_df[, clean_train_df$date], which simply returns the date vector;
# the direct column reference is equivalent and far clearer.)
clean_train_df$date <- as.Date(raw_train_df$date, "%m/%d/%Y")

clean_train_df$year <- year(clean_train_df$date)
clean_train_df$month <- month(clean_train_df$date)
clean_train_df$day <- day(clean_train_df$date)
# 0 = Sunday ... 6 = Saturday; date is already a Date, so no re-parse is needed
clean_train_df$day_of_week <- as.POSIXlt(clean_train_df$date)$wday

# Test Data Set: same date parsing and calendar-part extraction as the train set
clean_test_df$date <- as.Date(clean_test_df$date, "%m/%d/%Y")

clean_test_df$year <- year(clean_test_df$date)
clean_test_df$month <- month(clean_test_df$date)
clean_test_df$day <- day(clean_test_df$date)
# 0 = Sunday ... 6 = Saturday; date is already a Date, so no re-parse is needed
clean_test_df$day_of_week <- as.POSIXlt(clean_test_df$date)$wday

Exploratory Analysis

Histograms

In order to produce readable visualizations, a random sample of 1000 houses will be taken for all subsequent graphs/charts.

# Fix the RNG state so the 1000-house sample below is reproducible
set.seed(12345)

# Feature groups for the histogram grids
sqft_hist <- c('sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15')
stats_hist <- c('bedrooms', 'floors','condition', 'grade')

# Draw 1000 random row indices, then subset the training data to them
sampled_rows <- sample(nrow(clean_train_df), 1000)
df.1000 <- clean_train_df[sampled_rows,]

# Histogram grids for the square-footage and count/ordinal feature groups
# (multiple_hist is a project helper)
multiple_hist(df.1000, sqft_hist)

multiple_hist(df.1000, stats_hist)

# Single-variable histograms (project helper); the bin-count messages below
# are emitted by ggplot2's stat_bin
single_hist(df.1000$yr_built, "Year Built")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$yr_renovated, "Year Renovated")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$price, "Price")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$sqft_basement, "Basement Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatterplots

# Keep only the numeric columns, with the target (price) deliberately last
numerical_var <- c('bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built', 'sqft_living15', 'sqft_lot15', 'price')
scatter_df <- clean_train_df[,..numerical_var]
# Every column name except the trailing 'price'
var_list <- head(names(scatter_df), -1)

# Build one price-vs-predictor scatterplot per column (gg_scatter is a
# project helper) and lay them out in a single grid
plot_list <- lapply(var_list, gg_scatter, df = scatter_df)
do.call(grid.arrange, plot_list)

Geographic Analysis

The following map allows for the visualization of where houses in different price bands (based on quartiles) are located.

#Bin into quartiles for data visualization
df.1000$bin <- factor(Hmisc::cut2(df.1000$price, g = 4), labels = c(1:4))

colorsmap <- colors()[c(490,24,100,657)]
# Map each price-quartile bin to one fixed colour. The original passed
# color = ~colorsmap, which recycles the four colours row-by-row instead of
# colouring by bin, and used paste0(..., sep = "\n") — paste0() has no sep
# argument, so the "\n" strings were pasted literally (and ignored by the
# HTML popup anyway). Leaflet popups are HTML, so use <br> line breaks.
pal <- colorFactor(colorsmap, domain = df.1000$bin)
map <- leaflet(data.frame(df.1000)) %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addCircleMarkers(lng=~long, lat=~lat,
                   popup= paste0("Number of Bedrooms: ", df.1000$bedrooms, "<br>",
                                 "Number of Bathrooms: ", df.1000$bathrooms, "<br>",
                                 "Living Room Size: " , df.1000$sqft_living, "<br>",
                                 "Lot Size: ", df.1000$sqft_lot, "<br>",
                                 "Number of Floors: ", df.1000$floors, "<br>",
                                 "Current Condition: ", df.1000$condition),
                   color= ~pal(bin)) %>%
  addLegend(position = 'bottomright', pal = pal, values = ~bin,
            title = 'Price quartile')

map

Outlier Analysis

Univariate Analysis

The following charts look at the distributions of each numerical value to visually see outliers

# Print/plot outlier counts for each numeric predictor (univariate_outlier
# is a project helper; the "Outliers: n" lines below are its output).
# NOTE(review): this loops over clean_test_df — confirm the training set was
# not intended here, since the rest of the EDA uses clean_train_df.
for (var in numerical_var[1:(length(numerical_var)-1)]){
  univariate_outlier(clean_test_df, var)
}

## [1] "Outliers:  5"

## [1] "Outliers:  9"

## [1] "Outliers:  93"

## [1] "Outliers:  433"

## [1] "Outliers:  0"

## [1] "Outliers:  76"

## [1] "Outliers:  50"

## [1] "Outliers:  0"

## [1] "Outliers:  67"

## [1] "Outliers:  386"

Bivariate Analysis

The following charts look at the distributions of each numerical value to visually see outliers on a monthly and daily basis.

# Monthly/daily outlier views per numeric predictor (bivariate_outlier is a
# project helper).
# NOTE(review): also runs on clean_test_df — confirm train wasn't intended.
for (var in numerical_var[1:(length(numerical_var)-1)]){
  bivariate_outlier(clean_test_df, var)
}

Baseline Model Comparisons

This analysis will initially compare the results of Lasso Linear Regression, Ranger’s implementation of Random Forest, and XG Boost to determine which algorithm will be used going forward.

Lasso Linear Regression

# 80/20 split of the training data (f_partition is a project helper); the
# raw date column is dropped since its parts were extracted earlier
split_clean_train_df <- f_partition(clean_train_df, test_proportion = 0.2, seed = 123456)
split_clean_train_df$train$date = NULL
split_clean_train_df$test$date = NULL

# 5-fold cross-validated lasso (alpha = 1) to choose the penalty lambda
glmnet_cv<-cv.glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]), nfolds = 5, 
                     y = split_clean_train_df$train[['price']],
                     alpha=1, family = 'gaussian', standardize = TRUE)
# Use the S3 generic: plot.cv.glmnet is a method and is not exported by
# current glmnet releases, so calling it directly can fail
plot(glmnet_cv)

# Penalty value that minimised the cross-validated error
glmnet_cv$lambda.min
## [1] 450.4951
# Refit the lasso on the full training split at the chosen lambda
glmnet_0<-glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]), 
                 y = split_clean_train_df$train[['price']],
                 family = 'gaussian',
                 alpha=1, lambda = glmnet_cv$lambda.min)

print(glmnet_0)
## 
## Call:  glmnet(x = data.matrix(split_clean_train_df$train[, !c("id",      "price")]), y = split_clean_train_df$train[["price"]], family = "gaussian",      alpha = 1, lambda = glmnet_cv$lambda.min) 
## 
##      Df   %Dev Lambda
## [1,] 20 0.7089  450.5
# Coefficients: rows shown as '.' were shrunk to exactly zero by the L1 penalty
glmnet_0$beta
## 22 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## bedrooms      -2.900095e+04
## bathrooms      3.532356e+04
## sqft_living    1.384260e+02
## sqft_lot       4.452816e-02
## floors         4.680427e+03
## waterfront     5.724114e+05
## view           5.132169e+04
## condition      2.684994e+04
## grade          9.788737e+04
## sqft_above     3.474208e+01
## sqft_basement  .           
## yr_built      -2.509573e+03
## yr_renovated   2.116641e+01
## zipcode       -5.441303e+02
## lat            5.964963e+05
## long          -2.236335e+05
## sqft_living15  2.576933e+01
## sqft_lot15    -3.056068e-01
## year           3.130578e+04
## month          .           
## day           -2.188929e+02
## day_of_week    1.934748e+03
# Predict prices for the held-out 20% with the tuned lasso
test_glmnet<-predict(glmnet_0, newx = data.matrix(split_clean_train_df$test[,!c('id','price')]))

# Start the model-comparison table: actuals plus lasso predictions
df_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_glmnet)]
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  3 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  976066 802036 199080 970489 852231 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Held-out error metrics for the lasso (rmse/mae/mape/custom_rsq are
# project helpers)
rmse_glmnet<-rmse(real=split_clean_train_df$test$price, predicted = test_glmnet)
mae_glmnet<-mae(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet<-mape(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet
## [1] 0.2522237
# Fixed variable-name typo: was 'rsq_glment' (not referenced elsewhere)
rsq_glmnet<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_glmnet)
rsq_glmnet
## [1] 0.699868

Ranger Random Forest

# Baseline random forest via ranger (package defaults, 500 trees per the
# printout below) with impurity-based variable importance; id is excluded
# as a pure row key
baseline_rf <- ranger(formula = as.formula(price~.), data=split_clean_train_df$train[,!c('id')], importance = 'impurity')
print(baseline_rf)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_clean_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  22 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18126157726 
## R squared (OOB):                  0.8589635
# Score the held-out split with the baseline forest
test_rf<-predict(baseline_rf, data = split_clean_train_df$test, type='response')$predictions

# Append the RF predictions to the comparison table
df_pred<-cbind(df_pred, test_rf)
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  4 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  976066 802036 199080 970489 852231 ...
##  $ test_rf    : num  914609 1062291 412301 919651 664950 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Held-out error metrics for the random forest
rmse_rf<-rmse(real=split_clean_train_df$test$price, predicted = test_rf)
mae_rf<-mae(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf<-mape(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf
## [1] 0.1342545
rsq_rf<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_rf)
rsq_rf
## [1] 0.8429682

XG Boost

# Baseline XGBoost with a linear booster, 100 boosting rounds, squared-error
# regression.
# NOTE(review): 'reg:linear' is deprecated in newer xgboost releases in
# favour of the equivalent 'reg:squarederror' — consider updating.
xgb_reg_0<-xgboost(booster='gblinear',
                   data=data.matrix(split_clean_train_df$train[, !c('id','price'), with=F]),
                   label=split_clean_train_df$train$price,
                   nrounds = 100,
                   objective='reg:linear')
## [1]  train-rmse:285660.312500 
## [2]  train-rmse:271871.593750 
## [3]  train-rmse:264200.625000 
## [4]  train-rmse:259061.828125 
## [5]  train-rmse:255394.187500 
## [6]  train-rmse:252694.765625 
## [7]  train-rmse:250658.828125 
## [8]  train-rmse:249084.671875 
## [9]  train-rmse:247832.375000 
## [10] train-rmse:246807.906250 
## [11] train-rmse:245944.562500 
## [12] train-rmse:245195.015625 
## [13] train-rmse:244527.125000 
## [14] train-rmse:243919.750000 
## [15] train-rmse:243355.421875 
## [16] train-rmse:242824.765625 
## [17] train-rmse:242319.859375 
## [18] train-rmse:241836.250000 
## [19] train-rmse:241370.312500 
## [20] train-rmse:240919.375000 
## [21] train-rmse:240482.468750 
## [22] train-rmse:240058.484375 
## [23] train-rmse:239646.890625 
## [24] train-rmse:239247.390625 
## [25] train-rmse:238859.062500 
## [26] train-rmse:238481.734375 
## [27] train-rmse:238115.875000 
## [28] train-rmse:237759.703125 
## [29] train-rmse:237414.578125 
## [30] train-rmse:237079.859375 
## [31] train-rmse:236754.359375 
## [32] train-rmse:236439.359375 
## [33] train-rmse:236133.796875 
## [34] train-rmse:235837.609375 
## [35] train-rmse:235550.156250 
## [36] train-rmse:235272.187500 
## [37] train-rmse:235001.703125 
## [38] train-rmse:234740.046875 
## [39] train-rmse:234486.281250 
## [40] train-rmse:234240.406250 
## [41] train-rmse:234002.296875 
## [42] train-rmse:233770.296875 
## [43] train-rmse:233546.781250 
## [44] train-rmse:233328.921875 
## [45] train-rmse:233117.890625 
## [46] train-rmse:232912.843750 
## [47] train-rmse:232714.609375 
## [48] train-rmse:232521.625000 
## [49] train-rmse:232333.796875 
## [50] train-rmse:232151.843750 
## [51] train-rmse:231975.421875 
## [52] train-rmse:231803.765625 
## [53] train-rmse:231636.656250 
## [54] train-rmse:231474.531250 
## [55] train-rmse:231317.250000 
## [56] train-rmse:231163.781250 
## [57] train-rmse:231014.265625 
## [58] train-rmse:230869.203125 
## [59] train-rmse:230728.125000 
## [60] train-rmse:230590.203125 
## [61] train-rmse:230456.390625 
## [62] train-rmse:230325.609375 
## [63] train-rmse:230198.781250 
## [64] train-rmse:230074.156250 
## [65] train-rmse:229953.281250 
## [66] train-rmse:229835.171875 
## [67] train-rmse:229720.046875 
## [68] train-rmse:229607.750000 
## [69] train-rmse:229497.859375 
## [70] train-rmse:229390.890625 
## [71] train-rmse:229285.734375 
## [72] train-rmse:229183.906250 
## [73] train-rmse:229083.828125 
## [74] train-rmse:228986.203125 
## [75] train-rmse:228890.609375 
## [76] train-rmse:228796.640625 
## [77] train-rmse:228705.468750 
## [78] train-rmse:228615.953125 
## [79] train-rmse:228527.718750 
## [80] train-rmse:228441.828125 
## [81] train-rmse:228357.890625 
## [82] train-rmse:228274.656250 
## [83] train-rmse:228193.718750 
## [84] train-rmse:228114.015625 
## [85] train-rmse:228036.234375 
## [86] train-rmse:227959.343750 
## [87] train-rmse:227884.828125 
## [88] train-rmse:227810.859375 
## [89] train-rmse:227738.546875 
## [90] train-rmse:227667.109375 
## [91] train-rmse:227597.843750 
## [92] train-rmse:227528.718750 
## [93] train-rmse:227461.093750 
## [94] train-rmse:227394.515625 
## [95] train-rmse:227329.125000 
## [96] train-rmse:227264.921875 
## [97] train-rmse:227201.203125 
## [98] train-rmse:227139.000000 
## [99] train-rmse:227077.406250 
## [100]    train-rmse:227017.406250
# Booster summary: parameters, feature count, and the training-rmse log
print(xgb_reg_0)
## ##### xgb.Booster
## raw: 488 bytes 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, booster = "gblinear", objective = "reg:linear")
## params (as set within xgb.train):
##   booster = "gblinear", objective = "reg:linear", silent = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 22 
## niter: 100
## nfeatures : 22 
## evaluation_log:
##     iter train_rmse
##        1   285660.3
##        2   271871.6
## ---                
##       99   227077.4
##      100   227017.4
# Score the held-out split with the linear-booster XGBoost model
test_xgb<-predict(xgb_reg_0, newdata = data.matrix(split_clean_train_df$test[, !c('id','price'), with=F]), 
                  type='response')

# Append the XGBoost predictions to the comparison table
df_pred<-cbind(df_pred, test_xgb)
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  5 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  976066 802036 199080 970489 852231 ...
##  $ test_rf    : num  914609 1062291 412301 919651 664950 ...
##  $ test_xgb   : num  1012781 759628 400939 1131196 714877 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Held-out error metrics for XGBoost
rmse_xgb<-rmse(real=split_clean_train_df$test$price, predicted = test_xgb)
mae_xgb<-mae(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb<-mape(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb
## [1] 0.3273408
rsq_xgb<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_xgb)
rsq_xgb
## [1] 0.601485

Model Comparison

As can be seen from the following charts outlining each algorithm’s prediction metrics, Random Forest proved to have superior results when compared to the other two and will be used for subsequent feature engineering and tuning.

# Collect the held-out metrics for every model column in df_pred; the
# repeated sapply-over-prediction-columns pattern is hoisted into a helper
pred_cols <- df_pred[, !c('price','id')]
metric_for <- function(metric_fn) {
  sapply(pred_cols, function(p) metric_fn(real = df_pred$price, predicted = p))
}
result <- data.table(method = c('glmnet','rf','xgb_reg'),
                     rmse = metric_for(rmse),
                     mae = metric_for(mae),
                     mape = metric_for(mape),
                     rsq = metric_for(custom_rsq))

# plotting results metrics: one bar chart per metric across the three models
ggplot(result, aes(x=method, y=mape))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rmse))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=mae))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rsq))+geom_bar(stat='identity')

Feature Engineering

A number of features will be created in the hopes their inclusion into the model will improve the overall prediction abilities. The features created include 1) Weekday/Weekend flag, 2) Holiday flag, 3) Renovation flag (defined as when the 2015 area of either the lot or living room is different from the original area), 4) Missing Renovation Year flag (as the presence of a renovation year should correspond to a positive renovation flag), and 5) House Age. It was found that the inclusion of the first, third, and fourth features actually improved model performance as seen from the below graphs.

1. Weekday/Weekend

# Seed the feature-pipeline comparison with the baseline RF predictions
df_pipeline_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_rf)]
colnames(df_pipeline_pred) <-c('id','price','baseline')

fe_train_df1 <- clean_train_df
fe_test_df1 <- clean_test_df

# Feature 1: weekend-sale flag derived from the (still Date-typed) sale date
fe_train_df1$weekend <-as.logical(is.weekend(clean_train_df$date))
fe_test_df1$weekend <-as.logical(is.weekend(clean_test_df$date))
# NOTE(review): date is dropped from the train copy only here; the test copy
# keeps its date column until clean_test_df$date is nulled after feature 2 —
# confirm this asymmetry is intended.
fe_train_df1$date = NULL

# Re-split, retrain the RF with the new feature, and collect predictions
# (split_and_train is a project helper; OOB summary printed below)
fe_output_1 <- split_and_train(fe_train_df1)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  23 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18293098186 
## R squared (OOB):                  0.8576645
# Baseline vs feature-1 metrics: fe_output_1[[3]] holds actuals plus the
# baseline and weekend-feature predictions for the shared held-out split
result<-data.table(method=c('baseline','fe1'),
                   rmse=sapply(fe_output_1[[3]][,!c('price','id')],
                               function(x) return(rmse(real=fe_output_1[[3]]$price, predicted=x))),
                   mae=sapply(fe_output_1[[3]][,!c('price','id')],
                              function(x) return(mae(real=fe_output_1[[3]]$price, predicted=x))),
                   mape=sapply(fe_output_1[[3]][,!c('price','id')],
                               function(x) return(mape(real=fe_output_1[[3]]$price, predicted=x))),
                   rsq=sapply(fe_output_1[[3]][,!c('price','id')],
                                function(x) return(custom_rsq(real=fe_output_1[[3]]$price, predicted=x))))
# One bar chart per metric
ggplot(result, aes(x=method, y=mape))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rmse))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=mae))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rsq))+geom_bar(stat='identity')

2. Holiday

fe_train_df2 <- clean_train_df
fe_test_df2 <- clean_test_df

# Feature 2: flag sales that closed on a holiday.
# NOTE(review): confirm which holiday calendar is.holiday() is using here —
# it should match US holidays for this King County, WA data.
fe_train_df2$holiday <-as.logical(is.holiday(clean_train_df$date))
fe_test_df2$holiday <-as.logical(is.holiday(clean_test_df$date))
fe_train_df2$date = NULL

# Retrain with the holiday feature (project helper)
fe_output_2 <- split_and_train(fe_train_df2)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  23 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18279121691 
## R squared (OOB):                  0.8577733
# Compare the baseline against the holiday feature. split_and_train's
# comparison table ([[3]]) carries only two prediction columns — baseline
# and the current feature set — so the method column must name exactly
# those two. The original c('baseline','fe1','fe2') triggered the
# size-2-recycled-to-3 warnings shown below and mislabelled the metrics.
result<-data.table(method=c('baseline','fe2'),
                   rmse=sapply(fe_output_2[[3]][,!c('price','id')],
                               function(x) return(rmse(real=fe_output_2[[3]]$price, predicted=x))),
                   mae=sapply(fe_output_2[[3]][,!c('price','id')],
                              function(x) return(mae(real=fe_output_2[[3]]$price, predicted=x))),
                   mape=sapply(fe_output_2[[3]][,!c('price','id')],
                               function(x) return(mape(real=fe_output_2[[3]]$price, predicted=x))),
                   rsq=sapply(fe_output_2[[3]][,!c('price','id')],
                                function(x) return(custom_rsq(real=fe_output_2[[3]]$price, predicted=x))))
## Warning in data.table(method = c("baseline", "fe1", "fe2"), rmse =
## sapply(fe_output_2[[3]][, : Item 2 is of size 2 but maximum size is 3
## (recycled leaving remainder of 1 items)
## Warning in data.table(method = c("baseline", "fe1", "fe2"), rmse =
## sapply(fe_output_2[[3]][, : Item 3 is of size 2 but maximum size is 3
## (recycled leaving remainder of 1 items)
## Warning in data.table(method = c("baseline", "fe1", "fe2"), rmse =
## sapply(fe_output_2[[3]][, : Item 4 is of size 2 but maximum size is 3
## (recycled leaving remainder of 1 items)
## Warning in data.table(method = c("baseline", "fe1", "fe2"), rmse =
## sapply(fe_output_2[[3]][, : Item 5 is of size 2 but maximum size is 3
## (recycled leaving remainder of 1 items)
# One bar chart per metric for the updated comparison table
ggplot(result, aes(x=method, y=mape))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rmse))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=mae))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rsq))+geom_bar(stat='identity')

# Drop the raw date column from the shared test set before the remaining
# feature-engineering steps.
# NOTE(review): the fe_test_df1/fe_test_df2 copies were taken before this
# line, so they still carry their date column — confirm that is intended.
clean_test_df$date = NULL

3. Renovation Flag

fe_train_df3 <- clean_train_df
fe_test_df3 <- clean_test_df

# Feature 3: renovation proxy — TRUE when the 2015 living or lot area
# differs from the recorded area. The comparison is already logical, so the
# redundant ifelse(cond, T, F) wrapper is dropped.
fe_train_df3$renovated <- (fe_train_df3$sqft_living != fe_train_df3$sqft_living15) |
  (fe_train_df3$sqft_lot != fe_train_df3$sqft_lot15)
# Fixed column-name typo: was 'rennovated', which left feature 4's lookup of
# fe_test_df4$renovated on the test set resolving to NULL
fe_test_df3$renovated <- (fe_test_df3$sqft_living != fe_test_df3$sqft_living15) |
  (fe_test_df3$sqft_lot != fe_test_df3$sqft_lot15)

# Retrain with the renovation flag (project helper)
fe_output_3 <- split_and_train(fe_train_df3)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  24 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18398901242 
## R squared (OOB):                  0.8568413
# Compare the baseline against the renovation flag. fe_output_3[[3]]
# contains two prediction columns (baseline + current feature — the
# recycling warnings in the fe2/fe4 sections show each table has size-2
# metric vectors), so listing four methods silently recycled the two metric
# values evenly across the labels; name exactly the two models present.
result<-data.table(method=c('baseline','fe3'),
                   rmse=sapply(fe_output_3[[3]][,!c('price','id')],
                               function(x) return(rmse(real=fe_output_3[[3]]$price, predicted=x))),
                   mae=sapply(fe_output_3[[3]][,!c('price','id')],
                              function(x) return(mae(real=fe_output_3[[3]]$price, predicted=x))),
                   mape=sapply(fe_output_3[[3]][,!c('price','id')],
                               function(x) return(mape(real=fe_output_3[[3]]$price, predicted=x))),
                   rsq=sapply(fe_output_3[[3]][,!c('price','id')],
                                function(x) return(custom_rsq(real=fe_output_3[[3]]$price, predicted=x))))
# One bar chart per metric for the updated comparison table
ggplot(result, aes(x=method, y=mape))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rmse))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=mae))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rsq))+geom_bar(stat='identity')

4. Missing Renovation Year

fe_train_df4 <- fe_train_df3
fe_test_df4 <- fe_test_df3

# Feature 4: flag houses that look renovated (feature 3) but report no
# renovation year. The conditions are already logical, so the redundant
# ifelse(..., T, F) and '== T' are dropped.
# NOTE(review): confirm fe_test_df3's renovation column is actually named
# 'renovated' — a typo ('rennovated') there would make the test-set lookup NULL.
fe_train_df4$missing_ren_year <- (fe_train_df4$yr_renovated == 0) & fe_train_df4$renovated
fe_test_df4$missing_ren_year <- (fe_test_df4$yr_renovated == 0) & fe_test_df4$renovated

# Retrain with the missing-renovation-year flag (project helper)
fe_output_4 <- split_and_train(fe_train_df4)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  25 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       17949616095 
## R squared (OOB):                  0.8603371
# Compare the baseline against the missing-renovation-year feature.
# fe_output_4[[3]] carries only two prediction columns (baseline + current
# feature), so the method column must list exactly those two; the original
# five labels triggered the size-2-recycled-to-5 warnings shown below and
# mislabelled the metrics.
result<-data.table(method=c('baseline','fe4'),
                   rmse=sapply(fe_output_4[[3]][,!c('price','id')],
                               function(x) return(rmse(real=fe_output_4[[3]]$price, predicted=x))),
                   mae=sapply(fe_output_4[[3]][,!c('price','id')],
                              function(x) return(mae(real=fe_output_4[[3]]$price, predicted=x))),
                   mape=sapply(fe_output_4[[3]][,!c('price','id')],
                               function(x) return(mape(real=fe_output_4[[3]]$price, predicted=x))),
                   rsq=sapply(fe_output_4[[3]][,!c('price','id')],
                                function(x) return(custom_rsq(real=fe_output_4[[3]]$price, predicted=x))))
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4"), :
## Item 2 is of size 2 but maximum size is 5 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4"), :
## Item 3 is of size 2 but maximum size is 5 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4"), :
## Item 4 is of size 2 but maximum size is 5 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4"), :
## Item 5 is of size 2 but maximum size is 5 (recycled leaving remainder of 1
## items)
# One bar chart per metric for the updated comparison table
ggplot(result, aes(x=method, y=mape))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rmse))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=mae))+geom_bar(stat='identity')

ggplot(result, aes(x=method, y=rsq))+geom_bar(stat='identity')

5. House Age

fe_train_df5 <- clean_train_df
fe_test_df5 <- clean_test_df

# House age at the time of sale. The original used year(Sys.Date()) - yr_built,
# which makes the feature depend on when the script is run and is therefore
# non-reproducible; the sale-year column keeps it fixed for this 2014-2015 data.
# NOTE(review): assumes the cleaned frames carry the sale-year column 'year'
# (it appears in the final variable-importance table) — confirm upstream.
fe_train_df5$house_age <- fe_train_df5$year - fe_train_df5$yr_built
fe_test_df5$house_age <- fe_test_df5$year - fe_test_df5$yr_built

fe_output_5 <- split_and_train(fe_train_df5)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  24 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18544039287 
## R squared (OOB):                  0.855712
# Score every prediction column of the fe5 evaluation table. As with the fe4
# step, the method labels are derived from the prediction columns themselves
# so the labels can never fall out of alignment with the metrics (the
# hard-coded label vector silently relies on column order and count).
eval_dt <- fe_output_5[[3]]
pred_cols <- setdiff(names(eval_dt), c('price', 'id'))
result <- data.table(method = pred_cols,
                     rmse = sapply(eval_dt[, !c('price', 'id')],
                                   function(x) rmse(real = eval_dt$price, predicted = x)),
                     mae = sapply(eval_dt[, !c('price', 'id')],
                                  function(x) mae(real = eval_dt$price, predicted = x)),
                     mape = sapply(eval_dt[, !c('price', 'id')],
                                   function(x) mape(real = eval_dt$price, predicted = x)),
                     rsq = sapply(eval_dt[, !c('price', 'id')],
                                  function(x) custom_rsq(real = eval_dt$price, predicted = x)))
# Compare the feature sets on each metric.
# geom_col() is the documented shorthand for geom_bar(stat = 'identity').
ggplot(result, aes(x = method, y = mape)) + geom_col()

ggplot(result, aes(x = method, y = rmse)) + geom_col()

ggplot(result, aes(x = method, y = mae)) + geom_col()

ggplot(result, aes(x = method, y = rsq)) + geom_col()

# Best method per metric. Inside data.table's `i`, columns are evaluated in
# the table's own scope, so `result$` is unnecessary.
result[which.min(rmse)]
##      method     rmse      mae      mape       rsq
## 1: baseline 157301.6 72817.94 0.1342545 0.8429682
result[which.min(mae)]
##      method     rmse      mae      mape       rsq
## 1: baseline 157301.6 72817.94 0.1342545 0.8429682
result[which.min(mape)]
##      method     rmse      mae      mape       rsq
## 1: baseline 157301.6 72817.94 0.1342545 0.8429682
result[which.max(rsq)]
##      method     rmse      mae      mape       rsq
## 1: baseline 157301.6 72817.94 0.1342545 0.8429682

Combine Best Features

fe_train_df_final <- fe_train_df1
fe_test_df_final <- fe_test_df1

# renovated: flag rows where the current sqft differs from the "-15"
# neighborhood-era measurements. A comparison already yields a logical vector,
# so the original ifelse(cond, T, F) wrapper was redundant (and T/F are
# reassignable — TRUE/FALSE are the safe literals).
fe_train_df_final$renovated <-
  (fe_train_df_final$sqft_living != fe_train_df_final$sqft_living15) |
  (fe_train_df_final$sqft_lot != fe_train_df_final$sqft_lot15)
fe_test_df_final$renovated <-
  (fe_test_df_final$sqft_living != fe_test_df_final$sqft_living15) |
  (fe_test_df_final$sqft_lot != fe_test_df_final$sqft_lot15)

# missing_ren_year: property looks renovated but has no renovation year recorded.
fe_train_df_final$missing_ren_year <-
  (fe_train_df_final$yr_renovated == 0) & fe_train_df_final$renovated
fe_test_df_final$missing_ren_year <-
  (fe_test_df_final$yr_renovated == 0) & fe_test_df_final$renovated

# Train/evaluate the random forest on the combined feature set (fe1 features
# plus the renovated / missing_ren_year flags).
fe_output_final <- split_and_train(fe_train_df_final)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  25 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       17830789805 
## R squared (OOB):                  0.8612617
# Score every prediction column of the combined-features evaluation table.
# Method labels are derived from the actual prediction columns: the previous
# hard-coded 7-label vector did not match the number of prediction columns,
# producing the recycling warnings below and mislabeled (recycled) rows.
eval_dt <- fe_output_final[[3]]
pred_cols <- setdiff(names(eval_dt), c('price', 'id'))
result <- data.table(method = pred_cols,
                     rmse = sapply(eval_dt[, !c('price', 'id')],
                                   function(x) rmse(real = eval_dt$price, predicted = x)),
                     mae = sapply(eval_dt[, !c('price', 'id')],
                                  function(x) mae(real = eval_dt$price, predicted = x)),
                     mape = sapply(eval_dt[, !c('price', 'id')],
                                   function(x) mape(real = eval_dt$price, predicted = x)),
                     rsq = sapply(eval_dt[, !c('price', 'id')],
                                  function(x) custom_rsq(real = eval_dt$price, predicted = x)))
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4", :
## Item 2 is of size 2 but maximum size is 7 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4", :
## Item 3 is of size 2 but maximum size is 7 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4", :
## Item 4 is of size 2 but maximum size is 7 (recycled leaving remainder of 1
## items)
## Warning in data.table(method = c("baseline", "fe1", "fe2", "fe3", "fe4", :
## Item 5 is of size 2 but maximum size is 7 (recycled leaving remainder of 1
## items)
# Metric comparison across all feature-engineering variants.
# geom_col() is the documented shorthand for geom_bar(stat = 'identity').
ggplot(result, aes(x = method, y = mape)) + geom_col()

ggplot(result, aes(x = method, y = rmse)) + geom_col()

ggplot(result, aes(x = method, y = mae)) + geom_col()

ggplot(result, aes(x = method, y = rsq)) + geom_col()

# Best method per metric; columns resolve inside the data.table's own scope.
result[which.min(rmse)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 156353.5 72248.24 0.1333274 0.8448555
result[which.min(mae)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 156353.5 72248.24 0.1333274 0.8448555
result[which.min(mape)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 156353.5 72248.24 0.1333274 0.8448555
result[which.max(rsq)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 156353.5 72248.24 0.1333274 0.8448555

Hyperparameter Tuning

# test_rf_tuned <- csrf(
#   formula = as.formula(price~.),
#   training_data = split_clean_train_df$train[,!c('id')],
#   test_data = split_clean_train_df$test[,!c('id')],
#   params1 = list(importance = 'impurity'),
#   params2 = list(num.trees = 50)
# )

##################################################### TESTING ###################################################### 
####################################################################################################################
####################################################################################################################
final_train_df <- fe_output_final[[1]]
final_test_df <- fe_test_df_final

#Need to convert to integers as task doesn't support categoricals
# Encode the logical flag columns as 0/1 integers in the train split, the
# held-out split, and the validation set in one pass — replacing nine
# near-identical ifelse(x, 1, 0) calls (as.integer(TRUE) is 1L).
flag_cols <- c('weekend', 'renovated', 'missing_ren_year')
for (col in flag_cols) {
  final_train_df$train[[col]] <- as.integer(final_train_df$train[[col]])
  final_train_df$test[[col]] <- as.integer(final_train_df$test[[col]])
  final_test_df[[col]] <- as.integer(final_test_df[[col]])
}

# Explicit data.frame conversion avoids makeRegrTask's
# "data is not a pure data.frame" warning about data.table input.
task <- makeRegrTask(data = as.data.frame(final_train_df$train[, !c('id')]),
                     target = "price")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
# Estimate runtime
estimateTimeTuneRanger(task)
## Approximated time for tuning: 2H 43M 9S
# Tuning: model-based optimization over mtry, min.node.size and
# sample.fraction (70 iterations, 500 trees, 2 threads; no results file).
# `<-` replaces `=` for assignment per R convention.
res <- tuneRanger(task, num.trees = 500, num.threads = 2, iters = 70, save.file.path = NULL)
## Computing y column(s) for design. Not provided.
## [mbo] 0: mtry=9; min.node.size=151; sample.fraction=0.432 : y = 2.91e+10 : 4.6 secs : initdesign
## [mbo] 0: mtry=17; min.node.size=113; sample.fraction=0.838 : y = 2.17e+10 : 18.4 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=25; sample.fraction=0.351 : y = 2.52e+10 : 3.4 secs : initdesign
## [mbo] 0: mtry=21; min.node.size=1.16e+03; sample.fraction=0.615 : y = 4.52e+10 : 4.4 secs : initdesign
## [mbo] 0: mtry=10; min.node.size=8; sample.fraction=0.563 : y = 1.69e+10 : 14.6 secs : initdesign
## [mbo] 0: mtry=9; min.node.size=84; sample.fraction=0.644 : y = 2.2e+10 : 7.8 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=1.62e+03; sample.fraction=0.436 : y = 6.12e+10 : 1.9 secs : initdesign
## [mbo] 0: mtry=15; min.node.size=11; sample.fraction=0.278 : y = 1.83e+10 : 9.6 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=6; sample.fraction=0.735 : y = 1.66e+10 : 32.4 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=330; sample.fraction=0.78 : y = 4.11e+10 : 3.0 secs : initdesign
## [mbo] 0: mtry=19; min.node.size=558; sample.fraction=0.461 : y = 3.95e+10 : 6.9 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=5; sample.fraction=0.79 : y = 1.63e+10 : 31.4 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=14; sample.fraction=0.506 : y = 2.19e+10 : 5.0 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=412; sample.fraction=0.738 : y = 3.43e+10 : 6.8 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=2; sample.fraction=0.585 : y = 1.65e+10 : 28.1 secs : initdesign
## [mbo] 0: mtry=6; min.node.size=157; sample.fraction=0.38 : y = 3.24e+10 : 3.3 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=62; sample.fraction=0.541 : y = 2.03e+10 : 16.4 secs : initdesign
## [mbo] 0: mtry=16; min.node.size=198; sample.fraction=0.314 : y = 3.31e+10 : 4.3 secs : initdesign
## [mbo] 0: mtry=20; min.node.size=3; sample.fraction=0.405 : y = 1.69e+10 : 22.3 secs : initdesign
## [mbo] 0: mtry=5; min.node.size=4; sample.fraction=0.229 : y = 2.07e+10 : 5.2 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=41; sample.fraction=0.811 : y = 1.83e+10 : 22.7 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=2; sample.fraction=0.669 : y = 1.81e+10 : 13.5 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=1.89e+03; sample.fraction=0.338 : y = 6.23e+10 : 3.4 secs : initdesign
## [mbo] 0: mtry=7; min.node.size=569; sample.fraction=0.621 : y = 3.82e+10 : 2.8 secs : initdesign
## [mbo] 0: mtry=25; min.node.size=32; sample.fraction=0.206 : y = 2.14e+10 : 8.4 secs : initdesign
## [mbo] 0: mtry=8; min.node.size=2.33e+03; sample.fraction=0.267 : y = 6.58e+10 : 0.9 secs : initdesign
## [mbo] 0: mtry=24; min.node.size=3; sample.fraction=0.857 : y = 1.81e+10 : 54.4 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=20; sample.fraction=0.887 : y = 4.33e+10 : 2.3 secs : initdesign
## [mbo] 0: mtry=22; min.node.size=924; sample.fraction=0.698 : y = 4.13e+10 : 6.3 secs : initdesign
## [mbo] 0: mtry=13; min.node.size=17; sample.fraction=0.486 : y = 1.76e+10 : 13.5 secs : initdesign
## [mbo] 1: mtry=12; min.node.size=3; sample.fraction=0.622 : y = 1.66e+10 : 27.3 secs : infill_cb
## [mbo] 2: mtry=17; min.node.size=2; sample.fraction=0.833 : y = 1.66e+10 : 60.9 secs : infill_cb
## [mbo] 3: mtry=8; min.node.size=2; sample.fraction=0.448 : y = 1.74e+10 : 20.9 secs : infill_cb
## [mbo] 4: mtry=16; min.node.size=2; sample.fraction=0.782 : y = 1.65e+10 : 41.1 secs : infill_cb
## [mbo] 5: mtry=10; min.node.size=2; sample.fraction=0.728 : y = 1.65e+10 : 37.0 secs : infill_cb
## [mbo] 6: mtry=19; min.node.size=5; sample.fraction=0.525 : y = 1.66e+10 : 35.4 secs : infill_cb
## [mbo] 7: mtry=10; min.node.size=4; sample.fraction=0.508 : y = 1.69e+10 : 15.7 secs : infill_cb
## [mbo] 8: mtry=25; min.node.size=2; sample.fraction=0.224 : y = 1.81e+10 : 21.9 secs : infill_cb
## [mbo] 9: mtry=20; min.node.size=4; sample.fraction=0.795 : y = 1.7e+10 : 63.9 secs : infill_cb
## [mbo] 10: mtry=19; min.node.size=2; sample.fraction=0.212 : y = 1.83e+10 : 20.7 secs : infill_cb
## [mbo] 11: mtry=18; min.node.size=12; sample.fraction=0.604 : y = 1.69e+10 : 36.9 secs : infill_cb
## [mbo] 12: mtry=17; min.node.size=6; sample.fraction=0.44 : y = 1.69e+10 : 31.9 secs : infill_cb
## [mbo] 13: mtry=13; min.node.size=2; sample.fraction=0.815 : y = 1.65e+10 : 43.7 secs : infill_cb
## [mbo] 14: mtry=21; min.node.size=2; sample.fraction=0.452 : y = 1.67e+10 : 30.7 secs : infill_cb
## [mbo] 15: mtry=8; min.node.size=5; sample.fraction=0.588 : y = 1.7e+10 : 15.6 secs : infill_cb
## [mbo] 16: mtry=14; min.node.size=5; sample.fraction=0.761 : y = 1.63e+10 : 34.1 secs : infill_cb
## [mbo] 17: mtry=5; min.node.size=2; sample.fraction=0.526 : y = 1.8e+10 : 11.4 secs : infill_cb
## [mbo] 18: mtry=11; min.node.size=5; sample.fraction=0.668 : y = 1.65e+10 : 20.2 secs : infill_cb
## [mbo] 19: mtry=25; min.node.size=7; sample.fraction=0.472 : y = 1.7e+10 : 25.8 secs : infill_cb
## [mbo] 20: mtry=25; min.node.size=2; sample.fraction=0.515 : y = 1.68e+10 : 38.0 secs : infill_cb
## [mbo] 21: mtry=18; min.node.size=2; sample.fraction=0.473 : y = 1.67e+10 : 26.6 secs : infill_cb
## [mbo] 22: mtry=12; min.node.size=2; sample.fraction=0.743 : y = 1.65e+10 : 33.1 secs : infill_cb
## [mbo] 23: mtry=19; min.node.size=10; sample.fraction=0.484 : y = 1.68e+10 : 23.5 secs : infill_cb
## [mbo] 24: mtry=10; min.node.size=3; sample.fraction=0.773 : y = 1.63e+10 : 26.1 secs : infill_cb
## [mbo] 25: mtry=11; min.node.size=2; sample.fraction=0.559 : y = 1.65e+10 : 26.2 secs : infill_cb
## [mbo] 26: mtry=14; min.node.size=2; sample.fraction=0.433 : y = 1.71e+10 : 23.0 secs : infill_cb
## [mbo] 27: mtry=15; min.node.size=7; sample.fraction=0.591 : y = 1.65e+10 : 23.2 secs : infill_cb
## [mbo] 28: mtry=16; min.node.size=6; sample.fraction=0.827 : y = 1.65e+10 : 35.2 secs : infill_cb
## [mbo] 29: mtry=17; min.node.size=6; sample.fraction=0.797 : y = 1.66e+10 : 32.2 secs : infill_cb
## [mbo] 30: mtry=18; min.node.size=35; sample.fraction=0.611 : y = 1.82e+10 : 17.1 secs : infill_cb
## [mbo] 31: mtry=9; min.node.size=2; sample.fraction=0.612 : y = 1.67e+10 : 19.0 secs : infill_cb
## [mbo] 32: mtry=12; min.node.size=4; sample.fraction=0.793 : y = 1.63e+10 : 27.8 secs : infill_cb
## [mbo] 33: mtry=11; min.node.size=6; sample.fraction=0.529 : y = 1.68e+10 : 15.7 secs : infill_cb
## [mbo] 34: mtry=11; min.node.size=3; sample.fraction=0.736 : y = 1.61e+10 : 25.0 secs : infill_cb
## [mbo] 35: mtry=22; min.node.size=3; sample.fraction=0.491 : y = 1.67e+10 : 30.4 secs : infill_cb
## [mbo] 36: mtry=15; min.node.size=3; sample.fraction=0.815 : y = 1.64e+10 : 34.7 secs : infill_cb
## [mbo] 37: mtry=21; min.node.size=6; sample.fraction=0.582 : y = 1.67e+10 : 30.6 secs : infill_cb
## [mbo] 38: mtry=11; min.node.size=5; sample.fraction=0.721 : y = 1.63e+10 : 21.5 secs : infill_cb
## [mbo] 39: mtry=11; min.node.size=3; sample.fraction=0.589 : y = 1.65e+10 : 19.0 secs : infill_cb
## [mbo] 40: mtry=13; min.node.size=3; sample.fraction=0.793 : y = 1.63e+10 : 28.4 secs : infill_cb
## [mbo] 41: mtry=16; min.node.size=3; sample.fraction=0.561 : y = 1.64e+10 : 25.0 secs : infill_cb
## [mbo] 42: mtry=16; min.node.size=4; sample.fraction=0.791 : y = 1.65e+10 : 32.1 secs : infill_cb
## [mbo] 43: mtry=19; min.node.size=2; sample.fraction=0.568 : y = 1.65e+10 : 32.6 secs : infill_cb
## [mbo] 44: mtry=11; min.node.size=3; sample.fraction=0.78 : y = 1.62e+10 : 25.9 secs : infill_cb
## [mbo] 45: mtry=11; min.node.size=2; sample.fraction=0.82 : y = 1.66e+10 : 30.7 secs : infill_cb
## [mbo] 46: mtry=15; min.node.size=2; sample.fraction=0.786 : y = 1.65e+10 : 41.0 secs : infill_cb
## [mbo] 47: mtry=11; min.node.size=3; sample.fraction=0.696 : y = 1.65e+10 : 23.7 secs : infill_cb
## [mbo] 48: mtry=12; min.node.size=5; sample.fraction=0.735 : y = 1.64e+10 : 28.2 secs : infill_cb
## [mbo] 49: mtry=11; min.node.size=4; sample.fraction=0.768 : y = 1.63e+10 : 23.2 secs : infill_cb
## [mbo] 50: mtry=25; min.node.size=2; sample.fraction=0.444 : y = 1.68e+10 : 32.3 secs : infill_cb
## [mbo] 51: mtry=14; min.node.size=3; sample.fraction=0.554 : y = 1.65e+10 : 22.1 secs : infill_cb
## [mbo] 52: mtry=11; min.node.size=3; sample.fraction=0.731 : y = 1.62e+10 : 23.9 secs : infill_cb
## [mbo] 53: mtry=24; min.node.size=2; sample.fraction=0.545 : y = 1.67e+10 : 40.3 secs : infill_cb
## [mbo] 54: mtry=11; min.node.size=4; sample.fraction=0.74 : y = 1.64e+10 : 23.4 secs : infill_cb
## [mbo] 55: mtry=11; min.node.size=3; sample.fraction=0.763 : y = 1.64e+10 : 24.1 secs : infill_cb
## [mbo] 56: mtry=13; min.node.size=5; sample.fraction=0.781 : y = 1.62e+10 : 26.7 secs : infill_cb
## [mbo] 57: mtry=13; min.node.size=5; sample.fraction=0.784 : y = 1.63e+10 : 28.1 secs : infill_cb
## [mbo] 58: mtry=10; min.node.size=3; sample.fraction=0.723 : y = 1.62e+10 : 23.5 secs : infill_cb
## [mbo] 59: mtry=10; min.node.size=2; sample.fraction=0.73 : y = 1.63e+10 : 29.3 secs : infill_cb
## [mbo] 60: mtry=15; min.node.size=2; sample.fraction=0.561 : y = 1.64e+10 : 50.4 secs : infill_cb
## [mbo] 61: mtry=14; min.node.size=6; sample.fraction=0.79 : y = 1.66e+10 : 42.1 secs : infill_cb
## [mbo] 62: mtry=12; min.node.size=3; sample.fraction=0.787 : y = 1.63e+10 : 38.4 secs : infill_cb
## [mbo] 63: mtry=13; min.node.size=5; sample.fraction=0.9 : y = 1.67e+10 : 48.6 secs : infill_cb
## [mbo] 64: mtry=13; min.node.size=3; sample.fraction=0.785 : y = 1.65e+10 : 32.5 secs : infill_cb
## [mbo] 65: mtry=10; min.node.size=3; sample.fraction=0.746 : y = 1.64e+10 : 26.9 secs : infill_cb
## [mbo] 66: mtry=17; min.node.size=2; sample.fraction=0.898 : y = 1.67e+10 : 50.6 secs : infill_cb
## [mbo] 67: mtry=12; min.node.size=4; sample.fraction=0.728 : y = 1.66e+10 : 27.5 secs : infill_cb
## [mbo] 68: mtry=20; min.node.size=5; sample.fraction=0.897 : y = 1.73e+10 : 42.9 secs : infill_cb
## [mbo] 69: mtry=11; min.node.size=3; sample.fraction=0.795 : y = 1.63e+10 : 35.4 secs : infill_cb
## [mbo] 70: mtry=11; min.node.size=5; sample.fraction=0.774 : y = 1.63e+10 : 23.1 secs : infill_cb
# Mean of best 5 % of the results
res
## Recommended parameter settings: 
##   mtry min.node.size sample.fraction
## 1   11             3       0.7501049
## Results: 
##           mse exec.time
## 1 16194690823    25.012
# Model with the new tuned hyperparameters
res$model
## Model for learner.id=regr.ranger; learner.class=regr.ranger
## Trained on: task.id = final_train_df$train[, !c("id")]; obs = 13821; features = 25
## Hyperparameters: num.threads=2,verbose=FALSE,respect.unordered.factors=order,mtry=11,min.node.size=3,sample.fraction=0.75,num.trees=500,replace=FALSE
# Prediction: score the held-out split with the tuned model. The explicit
# as.data.frame() conversion avoids mlr's data.table-conversion warning.
final <- predict(res$model, newdata = as.data.frame(final_train_df$test[, !c('id')]))$data$response
## Warning in predict.WrappedModel(res$model, newdata =
## final_train_df$test[, : Provided data for prediction is not a pure
## data.frame but from class data.table, hence it will be converted.
# Append the tuned model's predictions to the accumulated prediction table.
df_pipeline_pred <- cbind(df_pipeline_pred, final)

# Score every prediction column against the true price. Labels come from the
# column names so they cannot fall out of alignment with the metrics, and the
# four previously-repeated sapply() expressions collapse into one helper.
pred_cols <- setdiff(names(df_pipeline_pred), c('price', 'id'))
score_all <- function(metric) {
  sapply(df_pipeline_pred[, !c('price', 'id')],
         function(x) metric(real = df_pipeline_pred$price, predicted = x))
}
result <- data.table(method = pred_cols,
                     rmse = score_all(rmse),
                     mae = score_all(mae),
                     mape = score_all(mape),
                     rsq = score_all(custom_rsq))
# Final metric comparison including the tuned model.
# geom_col() is the documented shorthand for geom_bar(stat = 'identity').
ggplot(result, aes(x = method, y = mape)) + geom_col()

ggplot(result, aes(x = method, y = rmse)) + geom_col()

ggplot(result, aes(x = method, y = mae)) + geom_col()

ggplot(result, aes(x = method, y = rsq)) + geom_col()

# Best method per metric; columns resolve inside the data.table's own scope.
result[which.min(rmse)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 142951.1 68111.46 0.1249845 0.8703131
result[which.min(mae)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 142951.1 68111.46 0.1249845 0.8703131
result[which.min(mape)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 142951.1 68111.46 0.1249845 0.8703131
result[which.max(rsq)]
##    method     rmse      mae      mape       rsq
## 1:    fe1 142951.1 68111.46 0.1249845 0.8703131

Retrain Model On Entire Data Set and Predict on Test Set

The following sections retrain the model on the entire training set, use this retrained model to make predictions on the validation set, and finally prepare the predictions for CSV output.

# Recombine the train and held-out splits so the final model sees all rows.
final_total_train <- rbind(final_train_df$train, final_train_df$test)

# price ~ . is already a formula object, so no as.formula() wrapper is needed.
final_rf <- ranger(price ~ ., data = final_total_train[, !c('id')],
                   importance = 'impurity')
print(final_rf)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = final_total_train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      17277 
## Number of independent variables:  25 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       18298130722 
## R squared (OOB):                  0.8637849
# Predict on the validation set and pair each prediction with its listing id.
# NOTE(review): id order is taken from clean_test_df and assumed to match the
# row order of final_test_df (both derive from the same test frame).
final_test_rf <- predict(final_rf, data = final_test_df, type = 'response')$predictions
prediction <- data.table(id = clean_test_df$id, final_test_rf = final_test_rf)
head(prediction)
##            id final_test_rf
## 1: 6414100192      542501.8
## 2: 6054650070      394543.5
## 3:   16000397      215575.6
## 4: 2524049179     1256474.8
## 5: 8562750320      594682.0
## 6: 7589200193      527994.2

Variable Importance

This chart displays the variable importance ranked by node impurity (i.e. the total reduction in variance achieved by splits on that variable across the forest). Many of the most important factors influencing house prices are intuitive (e.g. living area has a clear positive relationship with price).

# Turn the named importance vector into a table; keep.rownames preserves the
# variable names as a column ('rn'). The trailing [] forces data.table to
# print the result after the by-reference setDT() conversion.
importance_df <- data.frame(final_rf$variable.importance)
setDT(importance_df, keep.rownames = TRUE)[]
##                   rn final_rf.variable.importance
##  1:         bedrooms                 2.404102e+13
##  2:        bathrooms                 1.549827e+14
##  3:      sqft_living                 4.032911e+14
##  4:         sqft_lot                 4.382731e+13
##  5:           floors                 1.537731e+13
##  6:       waterfront                 5.582239e+13
##  7:             view                 8.431374e+13
##  8:        condition                 1.339576e+13
##  9:            grade                 3.728574e+14
## 10:       sqft_above                 2.044475e+14
## 11:    sqft_basement                 5.866676e+13
## 12:         yr_built                 7.246881e+13
## 13:     yr_renovated                 1.234942e+13
## 14:          zipcode                 7.043960e+13
## 15:              lat                 2.815001e+14
## 16:             long                 1.129015e+14
## 17:    sqft_living15                 2.012947e+14
## 18:       sqft_lot15                 4.725469e+13
## 19:             year                 4.000019e+12
## 20:            month                 1.664257e+13
## 21:              day                 1.961279e+13
## 22:      day_of_week                 1.079092e+13
## 23:          weekend                 8.288447e+11
## 24:        renovated                 4.486807e+11
## 25: missing_ren_year                 4.809395e+12
##                   rn final_rf.variable.importance
# Rename columns by reference — setnames() is data.table's idiomatic (copy-free)
# equivalent of colnames()<-.
setnames(importance_df, c('variable', 'importance'))

# Horizontal bar chart of variables ranked by impurity importance.
# guides(fill = "none") replaces the deprecated logical form guides(fill = F)
# (logical guide values were deprecated in ggplot2 3.3.4; F is also an unsafe,
# reassignable alias for FALSE).
ggplot(importance_df, aes(x = reorder(variable, importance), y = importance, fill = importance)) +
    geom_bar(stat = "identity", position = "dodge") + coord_flip() +
    ylab("Variable Importance") +
    xlab("") +
    ggtitle("Information Value Summary") +
    guides(fill = "none") +
    scale_fill_gradient(low = "red", high = "blue")

CSV Output

# Submission format: exactly two columns, id and target. row.names = FALSE
# prevents write.csv's default row-number index from being added as a spurious
# first column in the submission file.
setnames(prediction, c('id', 'target'))
write.csv(prediction, file = "output.csv", row.names = FALSE)